% Conference contribution (Great Lakes Bioinformatics Conference, 2025) on aMEDL,
% an autoencoder mixed effects deep learning framework for scRNA-seq batch effects.
% Typed as inproceedings, so the venue lives in booktitle rather than journal.
% NOTE(review): an earlier volume = 2025 field only repeated the year and was dropped;
% restore it if the proceedings really carry a volume number.
@inproceedings{Andrade_GLBIO2025,
    author    = {Andrade, Aixa and Nguyen, Son and Montillo, Albert},
    title     = {Autoencoder Mixed Effects Deep Learning for the interpretable analysis of {scRNA-seq} data by separately modeling batch-specific and -agnostic effects},
    booktitle = {Great Lakes Bioinformatics Conference},
    publisher = {Great Lakes Bioinformatics},
    year      = {2025},
    month     = may,
    abstract  = {Single-cell RNA sequencing data can provide unprecedented insights into cellular heterogeneity, yet batch effects arising from both technical and biological factors can obscure meaningful signals. We propose an autoencoder Mixed Effects Deep Learning framework, called aMEDL, that separately models batch-invariant and batch-specific variation to improve the suppression of batch effects, while preserving biologically relevant information. The aMEDL framework comprises two complementary autoencoder networks: an adversarial network that learns a batch-invariant representation, and a probabilistic network that learns batch-specific signals. This dual network approach explicitly models batch distributions rather than discarding them, capturing crucial biological variation that might otherwise be lost. We evaluate aMEDL across diverse datasets, including a single-cell dataset from cardiovascular tissue of healthy donors and a single-nucleus dataset from subjects with Autism Spectrum Disorder (ASD) and Typically Developing (TD) individuals. The framework is compared to the traditional method for scRNA-seq processing, principal component analysis (PCA), and to a newer neural network approach for data abstraction that uses a single autoencoder (AE) network. In both cases, the proposed framework outperforms the comparable methods. In the Healthy Heart dataset, while measuring batch separability via the mean Average Silhouette Width (ASW) with a range of -1.0 to +1.0, we find that aMEDL's random effects subnetwork accurately captures batch differences (higher is better) with an ASW of +0.37, outperforming PCA (-0.48) and AE (-0.45). Meanwhile, its fixed effects component effectively suppresses batch signals in the latent space (lower is better), with an ASW of -0.50 compared to -0.48 (PCA) and -0.45 (AE). Additionally, using UMAP-based visualizations, aMEDL is observed regularly outperforming the comparable methods. For example in the ASD dataset, it preserved cell type information that PCA did not and avoided spurious clusters observed from the AE approach. Similar favorable results were obtained in the ASD dataset, where the random effects subnetwork reliably captured donor-specific variations, demonstrating aMEDL's ability to disentangle donor variability from shared biological signals. Overall, aMEDL not only eliminates undesired batch effects, but also maintains batch-specific differences, preventing overcorrection and false clustering. As the first deep learning framework to simultaneously model batch-invariant and batch-specific signals, aMEDL provides an interpretable, generative platform for uncovering disease mechanisms, donor variability, and technical artifacts in single-cell transcriptomics, ultimately paving the way for deeper insights into health and disease.},
}